En este notebook vamos a realizar un análisis exploratorio de datos (EDA) con el objetivo de prepararlos para poder predecir quién contratará un seguro de autocaravanas.
# Check the active conda environment (IPython shell escape; only runs inside a notebook).
!conda info
active environment : base
active env location : /opt/anaconda3
shell level : 1
user config file : /Users/adrianadevicente/.condarc
populated config files : /Users/adrianadevicente/.condarc
conda version : 22.9.0
conda-build version : 3.22.0
python version : 3.9.13.final.0
virtual packages : __osx=10.16=0
__unix=0=0
__archspec=1=x86_64
base environment : /opt/anaconda3 (writable)
conda av data dir : /opt/anaconda3/etc/conda
conda av metadata url : None
channel URLs : https://repo.anaconda.com/pkgs/main/osx-64
https://repo.anaconda.com/pkgs/main/noarch
https://repo.anaconda.com/pkgs/r/osx-64
https://repo.anaconda.com/pkgs/r/noarch
package cache : /opt/anaconda3/pkgs
/Users/adrianadevicente/.conda/pkgs
envs directories : /opt/anaconda3/envs
/Users/adrianadevicente/.conda/envs
platform : osx-64
user-agent : conda/22.9.0 requests/2.28.1 CPython/3.9.13 Darwin/21.4.0 OSX/10.16
UID:GID : 501:20
netrc file : None
offline mode : False
# Inspect the Python interpreter the notebook kernel is actually using.
import sys
print(sys.version)     # interpreter version string
print(sys.path)        # module search path
print("---")
print(sys.executable)  # path to the interpreter binary
3.9.13 (main, Oct 13 2022, 16:12:30) [Clang 12.0.0 ] ['/Users/adrianadevicente/Downloads/Practica_EDA/notebooks', '/opt/anaconda3/envs/Practica0/lib/python39.zip', '/opt/anaconda3/envs/Practica0/lib/python3.9', '/opt/anaconda3/envs/Practica0/lib/python3.9/lib-dynload', '', '/opt/anaconda3/envs/Practica0/lib/python3.9/site-packages'] --- /opt/anaconda3/envs/Practica0/bin/python
#Funciones a usar en el análisis
def get_deviation_of_mean_perc(pd_loan, list_var_continuous, target, multiplier):
    """Summarise outliers of continuous variables relative to the mean.

    For each column in ``list_var_continuous``, values falling outside
    ``mean +/- multiplier * std`` are treated as outliers. For every column
    with at least one outlier, one row is appended containing:

    - the normalized distribution of ``target`` over the outlier rows
      (one column per target class present among the outliers),
    - ``'variable'``: the column name,
    - ``'sum_outlier_values'``: number of outlier rows,
    - ``'porcentaje_sum_outlier_values'``: fraction of rows that are outliers.

    Parameters
    ----------
    pd_loan : pd.DataFrame
        Data to analyse.
    list_var_continuous : list of str
        Continuous columns to check.
    target : str
        Name of the target column.
    multiplier : float
        Half-width of the acceptance band, in standard deviations.

    Returns
    -------
    pd.DataFrame
        One row per variable with outliers; empty if none are found
        (a message is printed in that case).
    """
    pd_final = pd.DataFrame()
    for col in list_var_continuous:
        series_mean = pd_loan[col].mean()
        series_std = pd_loan[col].std()
        band = multiplier * series_std
        left, right = series_mean - band, series_mean + band
        outlier_mask = (pd_loan[col] < left) | (pd_loan[col] > right)
        n_outliers = int(outlier_mask.sum())
        if n_outliers > 0:
            # Distribution of the target restricted to the outlier rows.
            target_dist = pd_loan[target][outlier_mask].value_counts(normalize=True)
            # Build the row directly instead of transposing a reset_index()
            # frame: the original approach raised IndexError when the outliers
            # contained a single target class, and relied on a pandas-version-
            # specific reset_index() column name ('index').
            row = pd.DataFrame([target_dist.values], columns=list(target_dist.index))
            row['variable'] = col
            row['sum_outlier_values'] = n_outliers
            row['porcentaje_sum_outlier_values'] = n_outliers / pd_loan[col].size
            pd_final = pd.concat([pd_final, row], axis=0).reset_index(drop=True)
    if pd_final.empty:
        print('No existen variables con valores outliers')
    return pd_final
def get_corr_matrix(dataset = None, metodo='pearson', size_figure=[10,8]):
    """Draw a correlation heatmap of the given DataFrame.

    Pass metodo='spearman' for Spearman correlation instead of Pearson.
    Returns 0 on success, or 1 when no dataset is supplied.
    """
    # Guard clause: nothing to plot without data.
    if dataset is None:
        print(u'\nHace falta pasar argumentos a la función')
        return 1
    sns.set(style="white")
    corr = dataset.corr(method=metodo)
    # Blank out the diagonal: self-correlation is always 1 and distracts.
    np.fill_diagonal(corr.values, 0)
    fig, axis = plt.subplots(figsize=size_figure)
    sns.heatmap(corr, center=0,
                square=True, linewidths=.5, cmap='viridis')
    plt.show()
    return 0
def cramers_v(confusion_matrix):
    """Bias-corrected Cramér's V for a contingency table.

    Implements the Bergsma-Wicher correction: the raw phi-squared and the
    row/column counts are shrunk before taking the square root, which
    reduces the upward bias of the classic statistic on small samples.
    Expects a 2-D array of observed frequencies; returns a float in [0, 1].
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    n_rows, n_cols = confusion_matrix.shape
    phi2 = chi2 / n
    # Shrink phi^2 toward zero (never below it).
    phi2_corrected = max(0, phi2 - (n_cols - 1) * (n_rows - 1) / (n - 1))
    rows_corrected = n_rows - (n_rows - 1) ** 2 / (n - 1)
    cols_corrected = n_cols - (n_cols - 1) ** 2 / (n - 1)
    return np.sqrt(phi2_corrected / min(cols_corrected - 1, rows_corrected - 1))
#Cargamos todas las librerias
import pandas as pd
import requests
import pandas_profiling
from pandas_profiling.utils.cache import cache_file
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.impute import KNNImputer
import scipy.stats as ss
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, \
silhouette_score, recall_score, precision_score, make_scorer, \
roc_auc_score, f1_score, precision_recall_curve, accuracy_score, roc_auc_score, \
classification_report, confusion_matrix
# Column names for the 86 columns of the raw data files loaded below
# (customer sociodemographics plus insurance contribution/policy counts).
list_columns = ['customer_subtype', 'houses', 'avg_size_household', 'avg_age', 'customer_type', 'roman_catholic',
'protestant', 'other_religion', 'no_religion', 'married', 'living_together', 'other_relation', 'singles',
'no_children', 'with_children', 'high_education', 'medium_education', 'lower_education', 'high_status', 'entrepreneur', 'farmer',
'middle_management', 'skilled_labourers', 'unskilled_labourers', 'socialclass_A', 'socialclass_B1',
'socialclass_B2', 'socialclass_C', 'socialclass_D', 'rented_house', 'home_owners', '1car', '2cars', 'no_car',
'national_health_service', 'private_health_service', 'income_30', 'income_30_45', 'income_45_75',
'income_75_122', 'income_123', 'average_income', 'purchasing_power_class', 'contrib_private', 'contrib_firms',
'contrib_agriculture', 'contrib_car_policies', 'contrib_van_policies', 'contrib_scooter_policies',
'contrib_lorry_policies', 'contrib_trailer_policies', 'contrib_tractor_policies', 'contrib_agricultural_policies',
'contrib_moped_policies', 'contrib_life_insur', 'contrib_privateaccident', 'contrib_familyaccidents',
'contrib_disability', 'contrib_fire', 'contrib_surfboard', 'contrib_boat', 'contrib_bicycle',
'contrib_property', 'contrib_socialsecurity', 'number_private_insurance', 'number_insurance_firms',
'number_insurance_agriculture', 'num_car_policies', 'num_van_policies', 'num_scooter_policies',
'num_lorry_policies', 'num_trailer_policies', 'num_tractor_policies', 'num_agricultural_policies',
'num_moped_policies', 'num_life_insurances', 'num_private_insurance_policies', 'num_family_insurance_policies',
'num_disability_policies', 'num_fire_policies', 'num_surfboard_policies', 'num_boat_policies',
'num_bicycle_policies', 'num_property_policies', 'num_social_insurance_policies', 'num_mobilehomepolicies']
# Load the training data: tab-delimited, no header row; assign the column
# names we just defined in list_columns.
data_train = pd.read_csv("../data/raw/ticdata2000.txt", delimiter = "\t", skiprows=0, header=None, names = list_columns)
# Load the evaluation (test) data with the same settings; it lacks the
# target column, hence only the first 85 names.
data_test = pd.read_csv("../data/raw/ticeval2000.txt", delimiter = "\t", skiprows=0, header=None, names = list_columns[0:85])
# Load the test-set target values (a single unnamed column).
target_test = pd.read_csv("../data/raw/tictgts2000.txt", skiprows=0, header= None)
# Quick look at the first rows of the training table.
data_train.head()
| customer_subtype | houses | avg_size_household | avg_age | customer_type | roman_catholic | protestant | other_religion | no_religion | married | ... | num_private_insurance_policies | num_family_insurance_policies | num_disability_policies | num_fire_policies | num_surfboard_policies | num_boat_policies | num_bicycle_policies | num_property_policies | num_social_insurance_policies | num_mobilehomepolicies | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 33 | 1 | 3 | 2 | 8 | 0 | 5 | 1 | 3 | 7 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 37 | 1 | 2 | 2 | 8 | 1 | 4 | 1 | 4 | 6 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 37 | 1 | 2 | 2 | 8 | 0 | 4 | 2 | 4 | 3 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 9 | 1 | 3 | 3 | 3 | 2 | 3 | 2 | 4 | 5 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 40 | 1 | 4 | 2 | 10 | 1 | 4 | 1 | 4 | 7 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 86 columns
# For each of the three loaded tables, print the full shape next to the
# deduplicated shape to gauge how many duplicate rows each file contains.
for frame in (data_train, data_test, target_test):
    print(frame.shape, frame.drop_duplicates().shape)
(5822, 86) (5220, 86) (4000, 85) (3693, 85) (4000, 1) (2, 1)
El fichero de train, que es el que usaremos en este análisis, tiene 86 columnas (variables) y 5822 filas (individuos).
# Column dtypes as a dict (every column is read in as int64, per the
# output below).
data_train.dtypes.to_dict()
{'customer_subtype': dtype('int64'),
'houses': dtype('int64'),
'avg_size_household': dtype('int64'),
'avg_age': dtype('int64'),
'customer_type': dtype('int64'),
'roman_catholic': dtype('int64'),
'protestant': dtype('int64'),
'other_religion': dtype('int64'),
'no_religion': dtype('int64'),
'married': dtype('int64'),
'living_together': dtype('int64'),
'other_relation': dtype('int64'),
'singles': dtype('int64'),
'no_children': dtype('int64'),
'with_children': dtype('int64'),
'high_education': dtype('int64'),
'medium_education': dtype('int64'),
'lower_education': dtype('int64'),
'high_status': dtype('int64'),
'entrepreneur': dtype('int64'),
'farmer': dtype('int64'),
'middle_management': dtype('int64'),
'skilled_labourers': dtype('int64'),
'unskilled_labourers': dtype('int64'),
'socialclass_A': dtype('int64'),
'socialclass_B1': dtype('int64'),
'socialclass_B2': dtype('int64'),
'socialclass_C': dtype('int64'),
'socialclass_D': dtype('int64'),
'rented_house': dtype('int64'),
'home_owners': dtype('int64'),
'1car': dtype('int64'),
'2cars': dtype('int64'),
'no_car': dtype('int64'),
'national_health_service': dtype('int64'),
'private_health_service': dtype('int64'),
'income_30': dtype('int64'),
'income_30_45': dtype('int64'),
'income_45_75': dtype('int64'),
'income_75_122': dtype('int64'),
'income_123': dtype('int64'),
'average_income': dtype('int64'),
'purchasing_power_class': dtype('int64'),
'contrib_private': dtype('int64'),
'contrib_firms': dtype('int64'),
'contrib_agriculture': dtype('int64'),
'contrib_car_policies': dtype('int64'),
'contrib_van_policies': dtype('int64'),
'contrib_scooter_policies': dtype('int64'),
'contrib_lorry_policies': dtype('int64'),
'contrib_trailer_policies': dtype('int64'),
'contrib_tractor_policies': dtype('int64'),
'contrib_agricultural_policies': dtype('int64'),
'contrib_moped_policies': dtype('int64'),
'contrib_life_insur': dtype('int64'),
'contrib_privateaccident': dtype('int64'),
'contrib_familyaccidents': dtype('int64'),
'contrib_disability': dtype('int64'),
'contrib_fire': dtype('int64'),
'contrib_surfboard': dtype('int64'),
'contrib_boat': dtype('int64'),
'contrib_bicycle': dtype('int64'),
'contrib_property': dtype('int64'),
'contrib_socialsecurity': dtype('int64'),
'number_private_insurance': dtype('int64'),
'number_insurance_firms': dtype('int64'),
'number_insurance_agriculture': dtype('int64'),
'num_car_policies': dtype('int64'),
'num_van_policies': dtype('int64'),
'num_scooter_policies': dtype('int64'),
'num_lorry_policies': dtype('int64'),
'num_trailer_policies': dtype('int64'),
'num_tractor_policies': dtype('int64'),
'num_agricultural_policies': dtype('int64'),
'num_moped_policies': dtype('int64'),
'num_life_insurances': dtype('int64'),
'num_private_insurance_policies': dtype('int64'),
'num_family_insurance_policies': dtype('int64'),
'num_disability_policies': dtype('int64'),
'num_fire_policies': dtype('int64'),
'num_surfboard_policies': dtype('int64'),
'num_boat_policies': dtype('int64'),
'num_bicycle_policies': dtype('int64'),
'num_property_policies': dtype('int64'),
'num_social_insurance_policies': dtype('int64'),
'num_mobilehomepolicies': dtype('int64')}
El tipo original de los datos es integer, posteriormente los trataremos y estableceremos el tipo que consideramos que es mejor.
# Percentage of each target category.
data_train_plot = data_train['num_mobilehomepolicies']\
    .value_counts(normalize = True)\
    .mul(100).rename('percent').reset_index()
# Absolute count of each target category.
data_train_plot_conteo = data_train['num_mobilehomepolicies'].value_counts().reset_index()
# Join counts and percentages on the category value.
# NOTE(review): joining on 'index' relies on pandas < 2.0, where
# value_counts().reset_index() names the category column 'index' — confirm
# before upgrading pandas.
data_train_plot_concat = pd.merge(data_train_plot,
                                  data_train_plot_conteo, on=['index'], how='inner')
# Display the combined table.
data_train_plot_concat
| index | percent | num_mobilehomepolicies | |
|---|---|---|---|
| 0 | 0 | 94.022673 | 5474 |
| 1 | 1 | 5.977327 | 348 |
# Visualise the target distribution (percent per category) with plotly.
target_plot = px.histogram(data_train_plot_concat, x="index", y=['percent'])
target_plot.show()
La distribución de la variable objetivo es la que se muestra en el gráfico. El 94,0227% de los individuos no poseen una póliza de caravana, mientras que el 5,9773% sí la poseen. No modificamos nada en esta variable puesto que la categorización es 0 y 1, significando 0 que no poseen y 1 que sí tienen, por lo que es adecuado para realizar el análisis.
# Count missing values with isnull(), summed per column and per row, both
# sorted in descending order.
data_train_null_columns = data_train.isnull().sum().sort_values(ascending=False)
data_train_null_rows = data_train.isnull().sum(axis=1).sort_values(ascending=False)
# Wrap the Series in DataFrames for nicer display, naming the count column.
data_train_null_columnas = pd.DataFrame(data_train_null_columns, columns=['nulos_columnas'])
data_train_null_filas = pd.DataFrame(data_train_null_rows, columns=['nulos_filas'])
# Show the per-column counts (easier to read than per-row).
data_train_null_columnas
| nulos_columnas | |
|---|---|
| customer_subtype | 0 |
| contrib_life_insur | 0 |
| contrib_property | 0 |
| contrib_bicycle | 0 |
| contrib_boat | 0 |
| ... | ... |
| socialclass_C | 0 |
| socialclass_B2 | 0 |
| socialclass_B1 | 0 |
| socialclass_A | 0 |
| num_mobilehomepolicies | 0 |
86 rows × 1 columns
Tras analizar la existencia de nulos, podemos ver que nuestro data frame no posee ninguno, por lo que no tenemos que hacer ninguna transformación para sustituirlos. Podemos continuar con el análisis con el data frame inicial.
Para fijar las variables categóricas, no hemos usado ninguna funcion, ya que no podíamos establecer un número máximo para decidir si lo eran, dada la variabilidad de las cifras. Tras haberlas analizado, hemos decidido que de la 0 a la 64, y también la 85, serán categóricas, dado que en el caso de las del primer rango (0:65) tienen un límite fijado y cada número hace referencia a un rango determinado y la 85 es la variable objetivo y está categorizada en 0 y 1 (0 sin poliza y 1 con poliza), por lo que también la incluimos. De la 65 a la 84 conservarán el tipo integer, puesto que se refieren al número de polizas de coche, polizas de vida... que poseen los individuos y son numeros enteros.
# Names of the categorical variables: columns 0 to 64 (the slice end 65 is
# exclusive).
var_category = list_columns[0:65]
# The target (column 85) is also treated as categorical (binary 0/1).
var_category.append(list_columns[85])
# Convert these columns to pandas' category dtype.
data_train[var_category] = data_train[var_category].astype("category")
# Verify the conversion.
data_train[var_category].dtypes
customer_subtype category
houses category
avg_size_household category
avg_age category
customer_type category
...
contrib_bicycle category
contrib_property category
contrib_socialsecurity category
number_private_insurance category
num_mobilehomepolicies category
Length: 66, dtype: object
# The remaining columns (65-84) are kept as integer counts and treated as
# continuous variables.
var_continuous = list_columns[65:85]
data_train[var_continuous].dtypes
number_insurance_firms int64 number_insurance_agriculture int64 num_car_policies int64 num_van_policies int64 num_scooter_policies int64 num_lorry_policies int64 num_trailer_policies int64 num_tractor_policies int64 num_agricultural_policies int64 num_moped_policies int64 num_life_insurances int64 num_private_insurance_policies int64 num_family_insurance_policies int64 num_disability_policies int64 num_fire_policies int64 num_surfboard_policies int64 num_boat_policies int64 num_bicycle_policies int64 num_property_policies int64 num_social_insurance_policies int64 dtype: object
Estas variables, como ya hemos comentado van a conservar el tipo integer, dado que informa del numero que posee el individuo de la variable en cuestión y son numeros enteros.
# Outlier summary: rows deviating more than 1 std from the mean per
# continuous variable, with the target distribution over those rows.
get_deviation_of_mean_perc(data_train, var_continuous, target='num_mobilehomepolicies', multiplier=1)
| 0.0 | 1.0 | variable | sum_outlier_values | porcentaje_sum_outlier_values | |
|---|---|---|---|---|---|
| 0 | 0.939024 | 0.060976 | number_insurance_firms | 82 | 0.014085 |
| 1 | 0.975000 | 0.025000 | number_insurance_agriculture | 120 | 0.020611 |
| 2 | 0.852830 | 0.147170 | num_car_policies | 265 | 0.045517 |
| 3 | 0.958333 | 0.041667 | num_van_policies | 48 | 0.008245 |
| 4 | 0.927928 | 0.072072 | num_scooter_policies | 222 | 0.038131 |
| 5 | 1.000000 | 0.000000 | num_lorry_policies | 9 | 0.001546 |
| 6 | 0.907692 | 0.092308 | num_trailer_policies | 65 | 0.011165 |
| 7 | 0.965035 | 0.034965 | num_tractor_policies | 143 | 0.024562 |
| 8 | 1.000000 | 0.000000 | num_agricultural_policies | 21 | 0.003607 |
| 9 | 0.979798 | 0.020202 | num_moped_policies | 396 | 0.068018 |
| 10 | 0.921502 | 0.078498 | num_life_insurances | 293 | 0.050326 |
| 11 | 0.967742 | 0.032258 | num_private_insurance_policies | 31 | 0.005325 |
| 12 | 0.842105 | 0.157895 | num_family_insurance_policies | 38 | 0.006527 |
| 13 | 0.826087 | 0.173913 | num_disability_policies | 23 | 0.003951 |
| 14 | 0.958645 | 0.041355 | num_fire_policies | 2805 | 0.481793 |
| 15 | 0.666667 | 0.333333 | num_surfboard_policies | 3 | 0.000515 |
| 16 | 0.606061 | 0.393939 | num_boat_policies | 33 | 0.005668 |
| 17 | 0.897959 | 0.102041 | num_bicycle_policies | 147 | 0.025249 |
| 18 | 0.888889 | 0.111111 | num_property_policies | 45 | 0.007729 |
| 19 | 0.804878 | 0.195122 | num_social_insurance_policies | 82 | 0.014085 |
La variable num_fire_policies es la que tiene mayor porcentaje de outliers; hemos decidido no hacer ningún cambio en ella, ya que tras analizarla, estos valores por encima de la media son de individuos que no poseen esa póliza, por lo que sustituirlos no sería apropiado para el modelo. En el resto tampoco vamos a hacer ninguna modificación; volveremos a esto posteriormente si vemos que podría ayudarnos a mejorar el modelo. En cuanto a la proporción de la variable objetivo en estas variables numéricas, el mayor porcentaje se alcanza en num_boat_policies. Estos no los consideramos malos resultados: solo nos indican que tendremos que incluir más variables en el modelo.
# Pearson correlation heatmap of the continuous variables.
get_corr_matrix(dataset = data_train[var_continuous],
                metodo = 'pearson', size_figure = [8,6])
0
# Pearson correlation matrix of the continuous variables.
corr = data_train[var_continuous].corr('pearson')
# Absolute values: we care about strength, not sign.
new_corr = corr.abs()
# Keep only the lower triangle (k=-1 also drops the diagonal) so each pair
# is listed once.
new_corr.loc[:,:] = np.tril(new_corr, k=-1)
# Flatten to a long-format frame sorted by correlation strength.
new_corr = new_corr.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
# Show only pairs with correlation above 0.5.
new_corr[new_corr['correlation']>0.5]
| level_0 | level_1 | correlation | |
|---|---|---|---|
| 141 | num_tractor_policies | number_insurance_agriculture | 0.547175 |
Las correlaciones entre variables son muy bajas, como podemos ver al hacer un filtrado por las mayores de 0.5 solo tenemos la correlación entre numero de polizas de tractor y numero de polizas para agricultura. Aunque podríamos considerar que nos van a dar información muy similar, decidimos no eliminarla, dado que no es una correlación muy alta.
# Temporarily cast the categorical columns back to int so their
# correlations can be computed.
data_train[var_category] = data_train[var_category].astype("int")
# Correlation heatmap for the (integer-encoded) categorical variables.
get_corr_matrix(dataset = data_train[var_category],
                metodo='pearson', size_figure=[8,6])
0
# Same long-format correlation listing as for the continuous set, now for
# the categorical variables.
corr_category = data_train[var_category].corr('pearson')
new_corr_category = corr_category.abs()
new_corr_category.loc[:,:] = np.tril(new_corr_category, k=-1)
new_corr_category = new_corr_category.stack().to_frame('correlation').reset_index().sort_values(by='correlation',
                                                                                               ascending=False)
# These correlations run higher, so filter at 0.7 instead of 0.5.
new_corr_category[new_corr_category['correlation']>0.7]
| level_0 | level_1 | correlation | |
|---|---|---|---|
| 2009 | home_owners | rented_house | 0.999554 |
| 2344 | private_health_service | national_health_service | 0.999239 |
| 264 | customer_type | customer_subtype | 0.992672 |
| 4267 | number_private_insurance | contrib_private | 0.981369 |
| 735 | other_relation | married | 0.884362 |
| 926 | with_children | avg_size_household | 0.794014 |
| 1138 | lower_education | medium_education | 0.747582 |
| 803 | singles | other_relation | 0.745642 |
| 534 | no_religion | protestant | 0.741895 |
| 2209 | no_car | 1car | 0.734564 |
# Contingency table and bias-corrected Cramér's V between the target and
# contrib_boat.
# NOTE(review): this rebinds the name `confusion_matrix` imported from
# sklearn.metrics earlier in the file — rename if that import is needed later.
confusion_matrix = pd.crosstab(data_train["num_mobilehomepolicies"], data_train["contrib_boat"])
print(confusion_matrix)
cramers_v(confusion_matrix.values)
contrib_boat 0 1 2 3 4 5 6 num_mobilehomepolicies 0 5454 2 3 3 9 2 1 1 335 3 2 2 4 0 2
0.11346381331047402
# Same computation for contrib_fire.
confusion_matrix3 = pd.crosstab(data_train["num_mobilehomepolicies"], data_train["contrib_fire"])
print(confusion_matrix3)
cramers_v(confusion_matrix3.values)
contrib_fire 0 1 2 3 4 5 6 7 8 num_mobilehomepolicies 0 2557 158 529 852 1075 141 152 9 1 1 109 3 6 68 151 8 3 0 0
0.15081040147537925
# Cramér's V of the target against every categorical variable (including
# the target itself, which trivially scores near 1).
for i in data_train[var_category]:
    confusion_matrix = pd.crosstab(data_train["num_mobilehomepolicies"], data_train[i])
    print(confusion_matrix)
    print(cramers_v(confusion_matrix.values))
customer_subtype 1 2 3 4 5 6 7 8 9 10 ... \ num_mobilehomepolicies ... 0 111 76 224 50 43 107 41 288 266 156 ... 1 13 6 25 2 2 12 3 51 12 9 ... customer_subtype 32 33 34 35 36 37 38 39 40 41 num_mobilehomepolicies 0 133 764 173 206 209 122 316 309 71 200 1 8 46 9 8 16 10 23 19 0 5 [2 rows x 40 columns] 0.1214125245618953 houses 1 2 3 4 5 6 7 8 10 num_mobilehomepolicies 0 4952 472 39 2 1 1 5 1 1 1 315 33 0 0 0 0 0 0 0 0.0 avg_size_household 1 2 3 4 5 num_mobilehomepolicies 0 276 2016 2475 643 64 1 8 115 171 50 4 0.030243996590796893 avg_age 1 2 3 4 5 6 num_mobilehomepolicies 0 73 1365 2817 1009 181 29 1 1 87 183 64 12 1 0.0 customer_type 1 2 3 4 5 6 7 8 9 10 num_mobilehomepolicies 0 504 436 827 52 554 201 530 1474 625 271 1 48 66 59 0 15 4 20 89 42 5 0.11698274512917237 roman_catholic 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 3051 1492 679 145 65 17 12 6 3 4 1 177 107 54 7 1 1 1 0 0 0 0.004515659184989305 protestant 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 77 129 372 561 1520 1404 675 509 62 165 1 1 5 24 29 87 97 39 55 3 8 0.047976667994437235 other_religion 0 1 2 3 4 5 num_mobilehomepolicies 0 1873 1921 1289 239 125 27 1 130 93 99 18 7 1 0.0340144828663424 no_religion 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 419 209 986 1366 1264 912 213 92 5 8 1 37 21 69 87 70 51 4 9 0 0 0.04226640288639122 married 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 62 74 154 240 314 898 1101 1567 336 728 1 2 1 3 6 10 48 71 116 25 66 0.06133135469764114 living_together 0 1 2 3 4 5 6 7 num_mobilehomepolicies 0 2287 1910 1016 152 77 18 13 1 1 161 120 59 7 1 0 0 0 0.011961420404819938 other_relation 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 1080 506 1646 1078 622 260 175 63 21 23 1 93 33 110 74 26 6 4 1 0 1 0.05681982799852275 singles 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 1629 887 1172 807 498 246 123 66 23 23 1 128 64 75 41 21 13 4 1 1 0 0.0392107064097822 no_children 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 348 348 1003 1409 1367 576 294 88 13 28 1 23 24 57 89 88 
30 27 8 1 1 0.0 with_children 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 148 282 596 914 1071 1048 727 328 188 172 1 5 10 39 59 66 58 56 23 18 14 0.02366955538623841 high_education 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 2056 1249 1074 508 289 166 59 45 20 8 1 91 73 70 39 37 21 8 6 2 1 0.08119222241325871 medium_education 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 409 361 889 1250 1340 684 329 139 34 39 1 14 22 48 80 86 54 19 18 3 4 0.040821845823858366 lower_education 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 270 216 602 633 800 960 824 613 241 315 1 29 27 65 47 51 49 32 27 13 8 0.09124021919390536 high_status 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 1451 1184 1286 703 364 236 120 78 23 29 1 73 61 78 53 33 13 18 14 3 2 0.07271458978526386 entrepreneur 0 1 2 3 4 5 num_mobilehomepolicies 0 3938 1120 322 34 11 49 1 233 82 26 3 1 3 0.0 farmer 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 3892 818 467 137 76 58 14 3 5 4 1 284 36 20 6 1 1 0 0 0 0 0.043806512275053534 middle_management 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 632 384 1406 1326 892 398 194 156 14 72 1 35 19 85 68 61 33 17 22 0 8 0.05268784814409118 skilled_labourers 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 1090 838 1296 1118 579 297 163 63 21 9 1 77 83 86 49 25 13 6 5 3 1 0.06299303482198562 unskilled_labourers 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 890 904 1357 1059 734 316 116 64 8 26 1 78 76 82 50 38 15 6 2 1 0 0.04994344919542805 socialclass_A 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 1654 1489 1131 637 232 113 84 66 12 56 1 84 80 67 48 29 14 12 13 1 0 0.08608184968100004 socialclass_B1 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 1281 1399 1682 710 279 74 20 5 7 17 1 72 81 101 65 19 4 5 0 1 0 0.04532307942438214 socialclass_B2 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 932 814 1576 1102 605 340 90 6 7 2 1 58 47 100 73 47 17 6 0 0 0 0.0 socialclass_C 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 339 242 799 1029 1097 1122 458 207 63 118 1 25 30 71 61 62 46 29 10 8 6 
0.0662220128865837 socialclass_D 0 1 2 3 4 5 6 7 9 num_mobilehomepolicies 0 2419 1465 812 427 218 99 21 12 1 1 188 98 40 14 5 1 1 1 0 0.05578407787614905 rented_house 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 855 391 679 554 491 494 359 406 516 729 1 94 37 38 39 26 25 23 19 16 31 0.08335526859379158 home_owners 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 729 514 407 359 474 494 565 686 391 855 1 31 16 19 23 25 26 39 38 37 94 0.082883813764249 1car 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 19 14 57 224 435 1151 1572 1294 242 466 1 0 0 1 7 13 59 91 119 19 39 0.06934149015828192 2cars 0 1 2 3 4 5 6 7 num_mobilehomepolicies 0 1747 1384 1636 363 282 53 8 1 1 107 84 112 22 19 3 1 0 0.0 no_car 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 1329 728 1517 1017 574 170 84 25 13 17 1 121 48 108 49 13 4 5 0 0 0 0.07520543399180181 national_health_service 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 48 15 280 162 329 902 830 1424 671 813 1 7 0 27 15 28 72 45 87 28 39 0.05469114120189934 private_health_service 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 813 671 1424 804 920 336 163 280 15 48 1 39 28 87 45 72 28 15 27 0 7 0.052551222491671 income_30 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 1206 576 1003 1029 578 551 284 149 47 51 1 98 54 91 50 21 17 9 7 1 0 0.08617595715350806 income_30_45 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 439 250 864 1073 1272 886 381 195 33 81 1 26 18 55 74 84 45 25 10 2 9 0.0 income_45_75 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 853 633 1117 1131 946 457 114 86 50 87 1 38 24 48 84 88 41 11 7 3 4 0.0716795204794355 income_75_122 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 3094 1262 678 227 133 64 8 1 4 3 1 152 97 58 19 14 7 0 0 0 1 0.056539242337743115 income_123 0 1 2 3 4 5 7 9 num_mobilehomepolicies 0 4611 713 88 35 24 1 1 1 1 289 50 8 1 0 0 0 0 0.0 average_income 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 25 48 631 1863 1715 663 331 114 62 22 1 0 1 20 69 139 70 24 17 8 0 0.10549998738432245 purchasing_power_class 1 2 3 4 5 6 7 8 
num_mobilehomepolicies 0 569 410 1453 856 553 835 407 391 1 18 15 71 46 30 66 67 35 0.11417835774021888 contrib_private 0 1 2 3 num_mobilehomepolicies 0 3335 193 1937 9 1 147 8 191 2 0.0967392319560506 contrib_firms 0 1 2 3 4 5 6 num_mobilehomepolicies 0 5397 7 28 20 17 1 4 1 343 0 2 3 0 0 0 0.0 contrib_agriculture 0 2 3 4 num_mobilehomepolicies 0 5357 3 55 59 1 345 0 2 1 0.0 contrib_car_policies 0 4 5 6 7 8 num_mobilehomepolicies 0 2773 1 599 2057 41 3 1 72 0 14 262 0 0 0.1805172111397274 contrib_van_policies 0 5 6 7 num_mobilehomepolicies 0 5428 10 33 3 1 346 0 2 0 0.0 contrib_scooter_policies 0 3 4 5 6 7 num_mobilehomepolicies 0 5268 1 127 28 48 2 1 332 2 9 4 1 0 0.05665441072703155 contrib_lorry_policies 0 4 6 9 num_mobilehomepolicies 0 5465 1 7 1 1 348 0 0 0 0.0 contrib_trailer_policies 0 1 2 3 4 5 num_mobilehomepolicies 0 5415 18 33 6 1 1 1 342 1 5 0 0 0 0.0 contrib_tractor_policies 0 3 4 5 6 num_mobilehomepolicies 0 5336 77 27 26 8 1 343 2 0 2 1 0.0 contrib_agricultural_policies 0 2 3 4 6 num_mobilehomepolicies 0 5453 4 6 8 3 1 348 0 0 0 0 0.0 contrib_moped_policies 0 2 3 4 5 6 num_mobilehomepolicies 0 5086 33 276 63 15 1 1 340 1 6 0 1 0 0.0367676124226097 contrib_life_insur 0 1 2 3 4 5 6 7 8 9 num_mobilehomepolicies 0 5204 9 28 78 83 31 36 3 1 1 1 325 0 0 6 11 4 2 0 0 0 0.015145064204040964 contrib_privateaccident 0 1 2 3 4 5 6 num_mobilehomepolicies 0 5444 3 17 4 3 1 2 1 347 0 1 0 0 0 0 0.0 contrib_familyaccidents 0 2 3 num_mobilehomepolicies 0 5442 23 9 1 342 2 4 0.04623224495705996 contrib_disability 0 4 5 6 7 num_mobilehomepolicies 0 5455 1 1 15 2 1 344 0 0 4 0 0.026075963384164017 contrib_fire 0 1 2 3 4 5 6 7 8 num_mobilehomepolicies 0 2557 158 529 852 1075 141 152 9 1 1 109 3 6 68 151 8 3 0 0 0.15081040147537925 contrib_surfboard 0 1 3 num_mobilehomepolicies 0 5472 1 1 1 347 1 0 0.02919647283588463 contrib_boat 0 1 2 3 4 5 6 num_mobilehomepolicies 0 5454 2 3 3 9 2 1 1 335 3 2 2 4 0 2 0.11346381331047402 contrib_bicycle 0 1 num_mobilehomepolicies 0 
5342 132 1 333 15 0.02290268344531061 contrib_property 0 1 2 3 4 5 6 num_mobilehomepolicies 0 5434 15 14 6 3 1 1 1 343 3 2 0 0 0 0 0.0 contrib_socialsecurity 0 2 3 4 5 num_mobilehomepolicies 0 5408 11 18 36 1 1 332 4 4 8 0 0.06600692666455492 number_private_insurance 0 1 2 num_mobilehomepolicies 0 3335 2133 6 1 147 201 0 0.08918644076237316 num_mobilehomepolicies 0 1 num_mobilehomepolicies 0 5474 0 1 0 348 0.998471614741666
# Restore the category dtype now that the correlation analysis is done.
data_train[var_category] = data_train[var_category].astype("category")
Se puede ver que hay correlaciones elevadas que podríamos quedarnos con solo una de las variables, como es el caso de home_owners rented_house. Por otra parte, la variable objetivo no tiene correlaciones altas con ninguna de las variables, la más elevada es 0,1805. El coeficiente v-cramer también es bajo. Esto tampoco consideramos que sean malos resultado, solo nos indica que tendremos que incluir más variables en el modelo para explicar nuestra variable target (num_mobilehomepolicies)
# Persist the preprocessed training data for the modelling stage.
data_train.to_csv("../data/processed/data_preprocessing.csv")